library(tidyverse)
## ── Attaching packages ─────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(dplyr)
library(stringr)
library(readr)
library(readxl)

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map
library(maps)
library(gsheet)

Read original data

library(gsheet)

heart_disease_stratify = gsheet2tbl('docs.google.com/spreadsheets/d/1W-xVHLoeZj37wrZdX-dP_SVouZKg5VmTpFdmOLWByR4/edit?usp=sharing') %>%

  janitor::clean_names() %>%
  rename(state = location_abbr) %>%
  rename(mortality_rate = data_value) %>%

  mutate(state = state.name[match(state, state.abb)]) %>% 
  select(-data_source, -geographic_level, -class, -topic, -data_value_footnote, -data_value_footnote_symbol, -topic_id, -location_id ) 

heart_disease = heart_disease_stratify %>% 
  filter(stratification1 == "Overall", stratification2 == "Overall") %>% 
  select(-stratification1, -stratification2, -stratification_category1, -stratification_category2) 

heart_disease$mortality_rate[is.na(heart_disease$mortality_rate)] = 0

heart_disease = 
heart_disease %>% 
  group_by(state) %>% 
  summarise(mortality_rate = mean(mortality_rate))

Add air quality data

airquality_2015 =
  gsheet2tbl('https://docs.google.com/spreadsheets/d/1CRJzMI0QbqU34BZZQ1bg3BnZPM_oUfVUCVPOo4AnoB0/edit?usp=sharing') %>%
  janitor::clean_names() %>%
  select(state, pm2_5) %>%   
  group_by(state) %>% 
  summarize(pm2.5 = sum(pm2_5))

Add obesity data

obesity_data = gsheet2tbl('https://docs.google.com/spreadsheets/d/1zlB2cOOMvD-IJIiGQ6YOyQ6QEXEKVAkrPTzxZrYci28/edit?usp=sharing') %>%
  janitor::clean_names() %>%
  rename(state = name) %>%
  rename(obesity_percentage = obesity) %>%
  select(state, obesity_percentage) 
data_with_obesity = left_join(heart_disease, obesity_data)
## Joining, by = "state"

Add stroke data

stroke_data = read_csv("data/Stroke_Mortality_Data_Among_US_Adults__35___by_State_Territory_and_County.csv") %>%
  janitor::clean_names() %>%
  rename(stroke_value=data_value)%>%
  rename(state = location_abbr) %>%
  mutate(state = state.name[match(state, state.abb)])%>%

  select(state,stroke_value) %>% 
  group_by(state) %>% 
  filter(!is.na(stroke_value)) %>% 
  summarize(stroke_value = sum(stroke_value)) 
## Parsed with column specification:
## cols(
##   Year = col_integer(),
##   LocationAbbr = col_character(),
##   LocationDesc = col_character(),
##   GeographicLevel = col_character(),
##   DataSource = col_character(),
##   Class = col_character(),
##   Topic = col_character(),
##   Data_Value = col_double(),
##   Data_Value_Unit = col_character(),
##   Data_Value_Type = col_character(),
##   Data_Value_Footnote_Symbol = col_character(),
##   Data_Value_Footnote = col_character(),
##   StratificationCategory1 = col_character(),
##   Stratification1 = col_character(),
##   StratificationCategory2 = col_character(),
##   Stratification2 = col_character(),
##   TopicID = col_character(),
##   LocationID = col_character(),
##   `Location 1` = col_character()
## )

Add income

income_data = read_excel("data/income_2015.xlsx", range = "A4:D55") %>%
  janitor::clean_names() %>%
  rename(state = united_states, median_income = x55117, income_standard_error = x253) 
data_with_income = left_join(heart_disease,income_data, by = "state")
data_income_obesity = left_join(income_data,data_with_obesity, by = "state")



smoking_data = gsheet2tbl("https://docs.google.com/spreadsheets/d/1ZU7uuqV-EZM81hE4kq0nFiPJHUMJj82fnXVllGFHJ00/edit?usp=sharing") %>% 
  filter(YEAR == "2015-2016") %>% 

  mutate(year = 2015) %>% 
  rename(state = LocationDesc) %>% 
  select(-YEAR) %>%
  filter(!is.na(Data_Value)) %>%

  select(year, state, Data_Value) %>% 

  select(year, state, Data_Value) %>% 
  rename(tobacco_comsumption = Data_Value) %>% 
  group_by(state) %>% 
  summarise(tobacco_consumption = sum(tobacco_comsumption))


data_income_obesity_smoking = left_join(smoking_data, data_income_obesity, by = "state")

data_income_obesity_smoking_air = left_join(airquality_2015, data_income_obesity_smoking, by = "state")




data_income_obesity_smoking = left_join(smoking_data, data_income_obesity, by = "state")
data_income_obesity_smoking_air = left_join(airquality_2015, data_income_obesity_smoking, by = "state")


final_data_export = left_join(stroke_data, data_income_obesity_smoking_air, by = "state") 

Find the association between smoke and heart disease mortality

final_data = gsheet2tbl('https://docs.google.com/spreadsheets/d/1ifmyU22AaB81PvMDCCRrsIqtifa0FuKhH5MpI_FU0Sc/edit?usp=sharing')
## Warning: Missing column names filled in: 'X1' [1]
 final_data %>%
  mutate(state = forcats::fct_reorder(factor(state), tobacco_consumption)) %>%
  ggplot(aes(x = mortality_rate, y = tobacco_consumption)) + 
  geom_point(aes(color = state), alpha = .5) +
  stat_smooth(method = "lm", col = "red") +
  labs(
    title = "Tabacco Consumption Accross states"
  ) +
  theme(text = element_text(size = 8), axis.text = element_text(angle = 60, hjust = 1), legend.position = "bottom")
## Warning: Removed 14 rows containing non-finite values (stat_smooth).
## Warning: Removed 14 rows containing missing values (geom_point).

lm(mortality_rate~tobacco_consumption, data = final_data) %>%
summary()
## 
## Call:
## lm(formula = mortality_rate ~ tobacco_consumption, data = final_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -124.235  -46.749    0.038   40.108  115.435 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         241.8406    45.2029   5.350 5.56e-06 ***
## tobacco_consumption   0.8384     0.3669   2.285   0.0285 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 62.37 on 35 degrees of freedom
##   (14 observations deleted due to missingness)
## Multiple R-squared:  0.1298, Adjusted R-squared:  0.105 
## F-statistic: 5.222 on 1 and 35 DF,  p-value: 0.02848

Find the association between obesity and heart disease mortality

  final_data %>%
  ggplot(aes(x = obesity_percentage, y = mortality_rate)) + 
  geom_point(aes(color = state), alpha = .5) +
  stat_smooth(method = "lm", col = "red") +
  labs(
    x = "Obesity(%)",
    y = "Mortality",
    title = "Heart Disease Mortality vs Obesity"
  ) +
  theme(text = element_text(size = 8), 
        axis.text = element_text(angle = 60, hjust = 1), 
        legend.position = "bottom",
        plot.title = element_text(hjust = 0.5))
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

  lm(mortality_rate ~ obesity_percentage, data = final_data) %>% 
  broom::tidy()
## # A tibble: 2 x 5
##   term               estimate std.error statistic  p.value
##   <chr>                 <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)           -17.8     46.9     -0.380 7.06e- 1
## 2 obesity_percentage     12.2      1.58     7.71  6.09e-10

Find the association between income and heart disease mortality

final_data_income = 
  final_data %>% 
  mutate(state = forcats::fct_reorder(factor(state), median_income)) 

final_data_income %>% 
  ggplot(aes(x = mortality_rate, y = median_income, color = state)) +
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  theme(text = element_text(size = 8), axis.text.x = element_text(angle = 60, hjust = 1), legend.position = "bottom") + 

  #Add the title and the name for x and y axis. 
  labs(
    title = "Association between Income and Heart Disease Mortality Rate",
    x = "Mortality Rate",
    y = "median_income"
  )
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

lm(mortality_rate ~ median_income, data = final_data_income) %>% 
  summary()
## 
## Call:
## lm(formula = mortality_rate ~ median_income, data = final_data_income)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -80.924 -29.753  -7.434  28.817 102.081 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.102e+02  3.994e+01  15.279  < 2e-16 ***
## median_income -4.826e-03  7.058e-04  -6.838  1.3e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.39 on 48 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.4935, Adjusted R-squared:  0.4829 
## F-statistic: 46.76 on 1 and 48 DF,  p-value: 1.303e-08

From the lm result, we can observe that median_income is a very significant variable with a p value of 1.3e-08. This indicates there is a strong association between income and heart disease mortality rate

Find the association between airquality and heart disease mortality

## make scatterplot 
final_data %>% 
  mutate(state = fct_reorder(state, mortality_rate)) %>% 
  ggplot(aes(x = mortality_rate, y = pm2.5, color = state)) + 
  geom_point() +
  ggtitle("Airquality VS Mortality Rate ") +
  stat_smooth(method = "lm", col = "red") +
  theme(legend.position = "bottom",
        legend.direction = "horizontal",
        axis.text.x = element_text(angle = 90, size = 6),
        legend.key.size = unit(0.05, "cm")) + 
  labs(x = "Mortality Rate",
       y = "PM2.5") 
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

## fit simple linear regression model 
air_regression<-lm(final_data$mortality_rate~final_data$pm2.5) 
summary(air_regression)
## 
## Call:
## lm(formula = final_data$mortality_rate ~ final_data$pm2.5)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -89.08 -38.02 -15.49  34.66 139.55 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      3.382e+02  1.427e+01  23.698   <2e-16 ***
## final_data$pm2.5 9.734e-04  4.673e-03   0.208    0.836    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 62.34 on 48 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.0009031,  Adjusted R-squared:  -0.01991 
## F-statistic: 0.04339 on 1 and 48 DF,  p-value: 0.8359

From the scatterplot, we can see that the points are spread randomlly. However, the relationship between pm2.5 and mortality rate is unclear. For the states, with low pm2.5, some of them have low mortality rate and some of them have high mortality rate. After we fit the simple regression model, the p-value for pm2.5 is 0.836, so it is a non-significant variable.

Find the association between average stroke value and average heart disease mortality, and SLR for all counties using wtfdata

mydata1=heart_disease_stratify%>%
  filter(stratification1 == "Overall", stratification2 == "Overall")%>%
  select(state, location_desc, mortality_rate)%>%
  filter(!is.na(mortality_rate))
  
mydata2=gsheet2tbl('https://docs.google.com/spreadsheets/d/1AZkDl8sNrTDnEX3Fhp1MeWNKNj9rnXaKaMbpVGndv5s/edit?usp=sharing') %>%
  janitor::clean_names() %>%
  rename(state = location_abbr) %>%
  rename(stroke_value = data_value) %>%
  mutate(state = state.name[match(state, state.abb)])%>%
  filter(stratification1 == "Overall", stratification2 == "Overall")%>%
  select(state, location_desc, stroke_value)%>%
  filter(!is.na(stroke_value))
                                                                                                                    
comdata = left_join(mydata1,mydata2)  
## Joining, by = c("state", "location_desc")
final_data %>%
  mutate(state = forcats::fct_reorder(state,stroke_value)) %>%
  ggplot(aes(x = mortality_rate, y = stroke_value)) + 
  geom_point(aes(color = state), alpha = .5)
## Warning: Removed 1 rows containing missing values (geom_point).

comdata %>%
  mutate(state = forcats::fct_reorder(state,stroke_value)) %>%
  ggplot(aes(x = mortality_rate, y = stroke_value)) + 
  geom_point(aes(color = state), alpha = .5)
## Warning: Removed 2 rows containing missing values (geom_point).

stroke_slm=lm(mortality_rate~stroke_value,data=comdata)
summary(stroke_slm)
## 
## Call:
## lm(formula = mortality_rate ~ stroke_value, data = comdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -374.77  -48.06   -7.98   37.16  955.31 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  143.23631    5.98863   23.92   <2e-16 ***
## stroke_value   2.74650    0.07525   36.50   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75.07 on 3269 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2895, Adjusted R-squared:  0.2893 
## F-statistic:  1332 on 1 and 3269 DF,  p-value: < 2.2e-16

find association between heart disease, stroke based, gender and race

new_heart_gender_data=gsheet2tbl('docs.google.com/spreadsheets/d/1W-xVHLoeZj37wrZdX-dP_SVouZKg5VmTpFdmOLWByR4/edit?usp=sharing') %>%
  janitor::clean_names() %>%
  rename(state = location_abbr) %>%
  rename(mortality_rate = data_value) %>%
  rename(gender=stratification1)%>%
  rename(race=stratification2)%>%
  mutate(state = state.name[match(state, state.abb)])%>%
  filter(gender%in%c("Male","Female"))%>%
  filter(race=="Overall")%>%
  select(state,location_desc, mortality_rate,gender)

new_stroke_gender_data=gsheet2tbl('https://docs.google.com/spreadsheets/d/1AZkDl8sNrTDnEX3Fhp1MeWNKNj9rnXaKaMbpVGndv5s/edit?usp=sharing') %>%
  janitor::clean_names() %>%
  rename(state = location_abbr) %>%
  rename(stroke_value = data_value) %>%
  rename(gender=stratification1)%>%
  rename(race=stratification2)%>%
  mutate(state = state.name[match(state, state.abb)])%>%
  filter(gender%in%c("Male","Female"))%>%
  filter(race=="Overall")%>%
  select(state, location_desc, stroke_value,gender)
new_gender_data=left_join(new_heart_gender_data,new_stroke_gender_data)%>%
  filter(!is.na(stroke_value))
## Joining, by = c("state", "location_desc", "gender")
gender_reg=lm(mortality_rate~stroke_value+gender+stroke_value*gender,data=new_gender_data)
summary(gender_reg)
## 
## Call:
## lm(formula = mortality_rate ~ stroke_value + gender + stroke_value * 
##     gender, data = new_gender_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -485.17  -48.03   -7.52   38.73 1158.15 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             112.49516    6.27099  17.939  < 2e-16 ***
## stroke_value              2.21351    0.08028  27.573  < 2e-16 ***
## genderMale              111.99707    8.56167  13.081  < 2e-16 ***
## stroke_value:genderMale   0.61112    0.10802   5.657  1.6e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.36 on 6466 degrees of freedom
## Multiple R-squared:  0.5931, Adjusted R-squared:  0.5929 
## F-statistic:  3141 on 3 and 6466 DF,  p-value: < 2.2e-16
ggplot(new_gender_data,aes(x=mortality_rate,y=stroke_value))+
  geom_point(aes(color=gender), alpha = .5)

new_heart_race_data=gsheet2tbl('docs.google.com/spreadsheets/d/1W-xVHLoeZj37wrZdX-dP_SVouZKg5VmTpFdmOLWByR4/edit?usp=sharing') %>%
  janitor::clean_names() %>%
  rename(state = location_abbr) %>%
  rename(mortality_rate = data_value) %>%
  rename(gender=stratification1)%>%
  rename(race=stratification2)%>%
  mutate(state = state.name[match(state, state.abb)])%>%
  filter(gender=="Overall")%>%
  filter(!(race=="Overall"))%>%
  select(state, location_desc, mortality_rate, race)

new_stroke_race_data=gsheet2tbl('https://docs.google.com/spreadsheets/d/1AZkDl8sNrTDnEX3Fhp1MeWNKNj9rnXaKaMbpVGndv5s/edit?usp=sharing') %>%
  janitor::clean_names() %>%
  rename(state = location_abbr) %>%
  rename(stroke_value = data_value) %>%
  rename(gender=stratification1)%>%
  rename(race=stratification2)%>%
  mutate(state = state.name[match(state, state.abb)])%>%
  filter(gender=="Overall")%>%
  filter(!(race=="Overall"))%>%
  select(state, location_desc, stroke_value,race)
new_race_data=left_join(new_heart_race_data,new_stroke_race_data)%>%
  filter(!is.na(stroke_value))
## Joining, by = c("state", "location_desc", "race")
race_reg=lm(mortality_rate~stroke_value+race+stroke_value*race,data=new_race_data)
summary(race_reg)
## 
## Call:
## lm(formula = mortality_rate ~ stroke_value + race + stroke_value * 
##     race, data = new_race_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -430.83  -48.04   -7.57   40.13 1110.23 
## 
## Coefficients:
##                                             Estimate Std. Error t value
## (Intercept)                                 175.9394    13.1181  13.412
## stroke_value                                  2.5703     0.1566  16.414
## raceAsian and Pacific Islander              -50.8487    15.7666  -3.225
## raceBlack                                    95.0941    15.4000   6.175
## raceHispanic                                -62.8309    14.8478  -4.232
## raceWhite                                   -23.2946    15.0278  -1.550
## stroke_value:raceAsian and Pacific Islander  -2.0687     0.2011 -10.285
## stroke_value:raceBlack                       -1.1864     0.1722  -6.892
## stroke_value:raceHispanic                    -1.1271     0.1901  -5.929
## stroke_value:raceWhite                        0.1906     0.1830   1.041
##                                             Pr(>|t|)    
## (Intercept)                                  < 2e-16 ***
## stroke_value                                 < 2e-16 ***
## raceAsian and Pacific Islander               0.00127 ** 
## raceBlack                                   7.01e-10 ***
## raceHispanic                                2.35e-05 ***
## raceWhite                                    0.12116    
## stroke_value:raceAsian and Pacific Islander  < 2e-16 ***
## stroke_value:raceBlack                      6.02e-12 ***
## stroke_value:raceHispanic                   3.21e-09 ***
## stroke_value:raceWhite                       0.29775    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 81.27 on 6670 degrees of freedom
##   (13 observations deleted due to missingness)
## Multiple R-squared:  0.595,  Adjusted R-squared:  0.5945 
## F-statistic:  1089 on 9 and 6670 DF,  p-value: < 2.2e-16
ggplot(new_race_data, aes(x=mortality_rate,y=stroke_value))+
  geom_point(aes(color=race))
## Warning: Removed 13 rows containing missing values (geom_point).

* From the interaction, we find that the p-value of “stroke_value : gender” is very small, indicate “gender” modifies the relationship between geart disease mortality and stroke mortality. With the plot, we can find that males have higher heart disease mortality rate, but the stroke mortality rate is similar. * From the linear regression model, we find that the Black has the highest heart disease mortality rate, and the Hispanic the least. ???????

Map

library(plotly)

map_data = final_data %>% 
    mutate(state = tolower(state)) 


states <- map_data("state") %>% 
  rename(state = region)
  
a = left_join(states, map_data, by = "state") %>% 
  mutate(text_label = str_c("Region: ", state, 'Mortality rate: ', mortality_rate) ) 

a$text_label <- with(a, paste(state, '<br>', "Mortality_rate", mortality_rate))
# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
  scope = 'usa',
  projection = list(type = 'albers usa'),
  showlakes = TRUE,
  lakecolor = toRGB('white')
)

p <- plot_geo(a, locationmode = 'USA-states') %>%
  add_trace(
    z = ~mortality_rate, text = ~text_label, locations = ~us,
    color = ~mortality_rate, colors = 'Purples'
  ) %>%
  colorbar(title = "Millions USD") %>%
  layout(
    title = '2011 US Agriculture Exports by State<br>(Hover for breakdown)',
    geo = g
  )
## Warning: Ignoring 10 observations
p
final_data = final_data %>%
  janitor::clean_names()

total_lm = lm(mortality_rate ~ tobacco_consumption + median_income + obesity_percentage + pm2_5 + stroke_value, data = final_data) %>%
  fortify()


ggplot(total_lm, aes(x = .fitted, y = .resid)) + geom_point() + 
  stat_smooth(method = "lm", col = "red") +
  labs(
    x = "Fitted value",
    y = "Residual",
    title = "Residuals vs Fitted"
  ) +
  theme(plot.title = element_text(hjust = 0.5))